Commit fe851fbc authored by zhouxiang

Supplement newly added files for version 0.2.6

parent e2d98ddc
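
# --- New test file: sampling logits processors ---
# Checks _process_temperature, _process_bad_words, _process_repetition_penalty,
# _filter_topk_sorted and _filter_topp_sorted against the equivalent
# transformers logits warpers/processors.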
import pytest
import torch
from transformers.generation.logits_process import (
RepetitionPenaltyLogitsProcessor, TemperatureLogitsWarper,
TopKLogitsWarper, TopPLogitsWarper)
@pytest.mark.parametrize('inplace', [True, False])
def test_process_temperature(inplace):
from lmdeploy.pytorch.engine.logits_process import _process_temperature
batch_size = 4
num_tokens = 16
scores = torch.rand(batch_size, num_tokens)
temperatures = torch.rand(batch_size)
gt = []
for score, temperature in zip(scores, temperatures):
warper = TemperatureLogitsWarper(temperature.item())
gt.append(warper(None, score[None]))
gt = torch.cat(gt)
out = _process_temperature(scores, temperatures, inplace=inplace)
torch.testing.assert_close(out, gt)
@pytest.mark.parametrize('inplace', [True, False])
def test_process_bad_words(inplace):
from lmdeploy.pytorch.engine.logits_process import _process_bad_words
filter_value: float = -float('inf')
batch_size = 4
num_tokens = 16
scores = torch.rand(batch_size, num_tokens)
bad_words = torch.tensor([
[0, 1],
[3, -1],
[4, 4],
[-1, -1],
])
out_scores = _process_bad_words(scores, bad_words, inplace=inplace)
for score, bw in zip(out_scores, bad_words):
bw = bw.tolist()
for w in bw:
if w >= 0:
assert score[w] == filter_value
@pytest.mark.parametrize('inplace', [True, False])
def test_process_repetition_penalty(inplace):
from lmdeploy.pytorch.engine.logits_process import \
_process_repetition_penalty
batch_size = 4
num_tokens = 16
scores = torch.rand(batch_size, num_tokens)
input_ids = torch.tensor([
[0, 1],
[3, 6],
[4, 4],
[0, 0],
])
penalties = 1 + torch.rand(batch_size)
gt = []
for score, ids, penalty in zip(scores, input_ids, penalties):
warper = RepetitionPenaltyLogitsProcessor(penalty.item())
gt.append(warper(ids[None], score[None].clone()))
gt = torch.cat(gt)
out = _process_repetition_penalty(scores,
input_ids,
penalties,
inplace=inplace)
torch.testing.assert_close(out, gt)
@pytest.mark.parametrize('inplace', [True, False])
def test_filter_topk_sorted(inplace):
from lmdeploy.pytorch.engine.logits_process import _filter_topk_sorted
batch_size = 4
num_tokens = 16
scores = torch.rand(batch_size, num_tokens).sort(1, descending=True)[0]
top_k = torch.randint(4, num_tokens - 4, (batch_size, ))
gt = []
for score, k in zip(scores, top_k):
warper = TopKLogitsWarper(k.item())
gt.append(warper(None, score[None].clone()))
gt = torch.cat(gt)
out = _filter_topk_sorted(scores, top_k, inplace=inplace)
torch.testing.assert_close(out, gt)
@pytest.mark.parametrize('inplace', [True, False])
def test_filter_topp_sorted(inplace):
from lmdeploy.pytorch.engine.logits_process import _filter_topp_sorted
batch_size = 4
num_tokens = 16
scores = torch.rand(batch_size, num_tokens).sort(1, descending=True)[0]
top_p = torch.rand(batch_size)
gt = []
for score, p in zip(scores, top_p):
warper = TopPLogitsWarper(p.item())
gt.append(warper(None, score[None].clone()))
gt = torch.cat(gt)
out = _filter_topp_sorted(scores, top_p, inplace=inplace)
torch.testing.assert_close(out, gt)
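
# --- New test file: engine request manager ---
# Verifies RequestManager callback binding and response handling, in both
# thread-safe and non-thread-safe modes, using a dummy step loop.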
import asyncio
import pytest
from lmdeploy.pytorch.engine.request import (RequestManager, RequestType,
Response, ResponseType)
class TestRequestHandler:
@pytest.fixture
def event_loop(self):
old_loop = asyncio.get_event_loop()
new_loop = asyncio.new_event_loop()
yield new_loop
new_loop.stop()
asyncio.set_event_loop(old_loop)
@pytest.fixture
def thread_safe(self, request):
yield request.param
@pytest.fixture
def manager(self, thread_safe):
yield RequestManager(thread_safe=thread_safe)
@pytest.mark.parametrize('thread_safe', [True, False])
def test_bind(self, manager, event_loop):
def __stop_engine_callback(reqs, **kwargs):
for req in reqs:
manager.response(
Response(type=ResponseType.SUCCESS,
sender_id=req.sender_id,
req_id=req.req_id,
data=f'{req.data} success'))
async def __dummy_loop():
while True:
manager.step()
await asyncio.sleep(0.1)
asyncio.set_event_loop(event_loop)
sender = manager.build_sender()
manager.start_loop(__dummy_loop)
# test not bind
req_id = sender.send_async(RequestType.STOP_ENGINE, None)
resp = sender.recv(req_id)
assert resp.type == ResponseType.HANDLER_NOT_EXIST
assert manager.is_loop_alive()
# test bind success
sender.send_async(RequestType.STOP_ENGINE, None)
manager.bind_func(RequestType.STOP_ENGINE, __stop_engine_callback)
req_id = sender.send_async(RequestType.STOP_ENGINE, 'test')
resp = sender.recv(req_id)
assert resp.data == 'test success'
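
# --- New test file: apply_rotary_pos_emb kernel ---
# Compares the apply_rotary_pos_emb kernel against a reference cos/sin
# rotation (_rotate_half) for several dtypes and query/key head counts.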
import pytest
import torch
from lmdeploy.pytorch.kernels import apply_rotary_pos_emb
def _rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., :x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2:]
return torch.cat((-x2, x1), dim=-1)
class TestApplyRotary:
@pytest.fixture
def dtype(self, request):
yield request.param
@pytest.fixture
def batch_size(self):
yield 4
@pytest.fixture
def num_heads_q(self, request):
yield request.param
@pytest.fixture
def num_heads_k(self, request):
yield request.param
@pytest.fixture
def feature_dim(self):
yield 16
@pytest.fixture
def seq_length(self, batch_size):
yield torch.randint(8, 16, (batch_size, ), device='cuda')
@pytest.fixture
def max_seqlen(self, seq_length):
yield seq_length.max()
@pytest.fixture
def q_states(self, seq_length, num_heads_q, feature_dim, dtype):
yield torch.rand(seq_length.sum(),
num_heads_q,
feature_dim,
dtype=dtype,
device='cuda')
@pytest.fixture
def k_states(self, seq_length, num_heads_k, feature_dim, dtype):
yield torch.rand(seq_length.sum(),
num_heads_k,
feature_dim,
dtype=dtype,
device='cuda')
@pytest.fixture
def position_ids_1d(self, seq_length, max_seqlen):
yield torch.randint(0,
max_seqlen.item(), (seq_length.sum().item(), ),
device='cuda')
@pytest.fixture
def cached_cos(self, max_seqlen, feature_dim, dtype):
yield torch.rand(max_seqlen, feature_dim, dtype=dtype, device='cuda')
@pytest.fixture
def cached_sin(self, max_seqlen, feature_dim, dtype):
yield torch.rand(max_seqlen, feature_dim, dtype=dtype, device='cuda')
@pytest.fixture
def gt(self, q_states, k_states, cached_cos, cached_sin, position_ids_1d):
cos = cached_cos[position_ids_1d, None, :]
sin = cached_sin[position_ids_1d, None, :]
q_embed = q_states * cos + _rotate_half(q_states) * sin
k_embed = k_states * cos + _rotate_half(k_states) * sin
yield q_embed, k_embed
@pytest.mark.parametrize('dtype',
[torch.bfloat16, torch.float16, torch.float32],
indirect=True)
@pytest.mark.parametrize(('num_heads_q', 'num_heads_k'), [(8, 8), (8, 4)],
indirect=True)
def test_apply_rotary(self, q_states, k_states, cached_cos, cached_sin,
position_ids_1d, gt):
q_embed, k_embed = apply_rotary_pos_emb(q_states, k_states, cached_cos,
cached_sin, None,
position_ids_1d)
q_gt, k_gt = gt
rtol = None
atol = None
if q_states.dtype == torch.float16:
rtol = 1e-5
atol = 1e-3
torch.testing.assert_close(q_embed, q_gt, rtol=rtol, atol=atol)
torch.testing.assert_close(k_embed, k_gt, rtol=rtol, atol=atol)
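
# --- New test file: fill_kv_cache kernel ---
# Fills blocked (paged) key/value caches with fill_kv_cache and compares
# against a Python loop that writes the same blocks by hand.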
import pytest
import torch
from lmdeploy.pytorch.kernels.fill_kv_cache import fill_kv_cache
def _div_up(a, b):
return (a + b - 1) // b
class TestFillKVCache:
@pytest.fixture
def num_heads(self):
yield 4
@pytest.fixture
def head_dim(self):
yield 32
@pytest.fixture
def block_size(self):
yield 16
@pytest.fixture
def seq_lens(self, request):
yield request.param
@pytest.fixture
def history_lens(self, request):
yield request.param
@pytest.fixture
def batch_size(self, seq_lens):
yield len(seq_lens)
@pytest.fixture
def kv_lens(self, seq_lens, history_lens):
yield [s + h for s, h in zip(seq_lens, history_lens)]
@pytest.fixture
def max_q_seq_length(self, seq_lens):
yield max(seq_lens)
@pytest.fixture
def num_tokens(self, seq_lens):
yield sum(seq_lens)
@pytest.fixture
def num_blocks_per_input(self, kv_lens, block_size):
yield [_div_up(kv_len, block_size) for kv_len in kv_lens]
@pytest.fixture
def max_num_blocks(self, num_blocks_per_input):
yield max(num_blocks_per_input)
@pytest.fixture
def q_seq_length(self, seq_lens):
yield torch.tensor(seq_lens).cuda()
@pytest.fixture
def q_start_loc(self, q_seq_length):
cum_seq_length = q_seq_length.cumsum(0)
yield cum_seq_length - q_seq_length
@pytest.fixture
def kv_seq_length(self, kv_lens):
yield torch.tensor(kv_lens).cuda()
@pytest.fixture
def k_states(self, num_tokens, num_heads, head_dim):
yield torch.rand(num_tokens, num_heads, head_dim).cuda()
@pytest.fixture
def v_states(self, k_states):
yield torch.rand_like(k_states)
@pytest.fixture
def k_caches(self, batch_size, max_num_blocks, block_size, num_heads,
head_dim):
shape = (batch_size * max_num_blocks, block_size, num_heads, head_dim)
yield torch.full(shape, 0.0).cuda()
@pytest.fixture
def v_caches(self, k_caches):
yield torch.rand_like(k_caches)
@pytest.fixture
def block_offsets(self, num_blocks_per_input):
batch_size = len(num_blocks_per_input)
max_num_blocks = max(num_blocks_per_input)
batch_ids = torch.arange(batch_size)
ret = torch.arange(max_num_blocks)
ret = batch_ids[:, None] + ret[None, :] * batch_size
yield ret.cuda()
@pytest.fixture
def gt(self, k_states, v_states, k_caches, v_caches, seq_lens,
history_lens, block_offsets, block_size):
batch_size = len(seq_lens)
k_caches = k_caches.clone()
v_caches = v_caches.clone()
splited_k_states = k_states.split(seq_lens)
splited_v_states = v_states.split(seq_lens)
for bidx in range(batch_size):
k_state = splited_k_states[bidx]
v_state = splited_v_states[bidx]
h_len = history_lens[bidx]
b_offs = block_offsets[bidx]
block_id = _div_up(h_len + 1, block_size) - 1
fill_start = h_len % block_size
fill_size = min(block_size - fill_start, k_state.size(0))
while True:
boff = b_offs[block_id]
tmp_ks = k_state[:fill_size]
tmp_vs = v_state[:fill_size]
fill_end = fill_start + fill_size
k_caches[boff, fill_start:fill_end] = tmp_ks
v_caches[boff, fill_start:fill_end] = tmp_vs
k_state = k_state[fill_size:]
v_state = v_state[fill_size:]
block_id += 1
fill_start = 0
fill_size = min(block_size, k_state.size(0))
if fill_size == 0:
break
yield k_caches, v_caches
@pytest.mark.parametrize(['seq_lens', 'history_lens'], [
((1, 1, 1, 1), (1, 16, 31, 24)),
((1, 8, 16, 24), (1, 16, 31, 24)),
],
indirect=True)
def test_fill_kv_cache(self, k_states, v_states, k_caches, v_caches,
block_offsets, q_start_loc, q_seq_length,
kv_seq_length, max_q_seq_length, gt):
fill_kv_cache(k_states, v_states, k_caches, v_caches, q_start_loc,
q_seq_length, kv_seq_length, max_q_seq_length,
block_offsets)
torch.testing.assert_close(k_caches, gt[0])
torch.testing.assert_close(v_caches, gt[1])
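
# --- New test file: fused_rotary_emb kernel ---
# Checks fused_rotary_emb against a reference built from
# DummyLinearScalingRotaryEmbedding and apply_rotary_pos_emb.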
import pytest
import torch
from torch import nn
from lmdeploy.pytorch.kernels.fused_rotary_emb import fused_rotary_emb
class DummyRotaryEmbedding(nn.Module):
def __init__(self,
dim,
max_position_embeddings=2048,
base=10000,
device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base**(torch.arange(
0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer('inv_freq', inv_freq, persistent=False)
def forward(self, x, position_ids, seq_len=None):
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(
position_ids.shape[0], -1, 1)
position_ids_expanded = position_ids[:, None, :].float()
freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)
emb = torch.cat((freqs, freqs), dim=-1)
cos = emb.cos().to(dtype=x.dtype)
sin = emb.sin().to(dtype=x.dtype)
# backwards compatibility
return cos, sin
class DummyLinearScalingRotaryEmbedding(DummyRotaryEmbedding):
def __init__(self,
dim,
max_position_embeddings=2048,
base=10000,
device=None,
scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def forward(self, x, position_ids, seq_len=None):
position_ids = position_ids.float() / self.scaling_factor
cos, sin = super().forward(x, position_ids, seq_len)
return cos, sin
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., :x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2:]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=2):
"""Applies Rotary Position Embedding to the query and key tensors."""
cos = cos.unsqueeze(unsqueeze_dim)
sin = sin.unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
class TestFusedRotaryEmb:
@pytest.fixture
def dtype(self):
yield torch.float16
@pytest.fixture
def batch_size(self):
yield 2
@pytest.fixture
def head_dim(self):
yield 64
@pytest.fixture
def q_num_heads(self):
yield 4
@pytest.fixture
def k_num_heads(self):
yield 2
@pytest.fixture
def seq_len(self):
yield 100
@pytest.fixture
def q(self, batch_size, seq_len, q_num_heads, head_dim, dtype):
yield torch.rand(batch_size,
seq_len,
q_num_heads,
head_dim,
dtype=dtype).to('cuda')
@pytest.fixture
def k(self, batch_size, seq_len, k_num_heads, head_dim, dtype):
yield torch.rand(batch_size,
seq_len,
k_num_heads,
head_dim,
dtype=dtype).to('cuda')
@pytest.fixture
def position_ids(self, batch_size, seq_len):
yield torch.randint(0, seq_len + 100, (batch_size, seq_len)).cuda()
@pytest.fixture
def rotary_emb(self, head_dim):
yield DummyLinearScalingRotaryEmbedding(head_dim,
scaling_factor=1.0).to('cuda')
@pytest.fixture
def gt(self, q, k, position_ids, rotary_emb):
with torch.inference_mode():
cos, sin = rotary_emb(q, position_ids)
yield apply_rotary_pos_emb(q,
k,
cos,
sin,
position_ids=position_ids)
def test_fused_rotary_emb(self, q, k, position_ids, rotary_emb, gt):
inv_freq = rotary_emb.inv_freq
scaling_factor = rotary_emb.scaling_factor
with torch.inference_mode():
outq, outk = fused_rotary_emb(q,
k,
position_ids,
inv_freq,
scaling_factor=scaling_factor)
gtq, gtk = gt
torch.testing.assert_close(outq, gtq, atol=1e-3, rtol=1e-5)
torch.testing.assert_close(outk, gtk, atol=1e-3, rtol=1e-5)
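
# --- New test file: mbgmm kernels ---
# Tests the paged LoRA kernels mbgmm_a/mbgmm_b: the result should match
# input @ lora_a @ lora_b computed per sequence.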
import pytest
import torch
from torch.nn.utils.rnn import pad_sequence
from lmdeploy.pytorch.kernels.mbgmm import mbgmm_a, mbgmm_b
class TestMBGMM:
@pytest.fixture
def dtype(self):
yield torch.float16
@pytest.fixture
def head_size(self):
yield 32
@pytest.fixture
def out_head_size(self):
yield 16
@pytest.fixture
def seq_lens(self):
yield torch.tensor([2, 4, 6, 8]).cuda()
@pytest.fixture
def ranks(self):
yield torch.tensor([2, 4]).cuda()
@pytest.fixture
def page_start(self, ranks):
yield torch.zeros_like(ranks)
@pytest.fixture
def start_loc(self, seq_lens):
yield seq_lens.cumsum(0) - seq_lens
@pytest.fixture
def input(self, seq_lens, head_size, dtype):
total_len = seq_lens.sum()
yield torch.rand(total_len, head_size, dtype=dtype).cuda()
@pytest.fixture
def adapter_ids(self, seq_lens, ranks):
num_ranks = len(ranks)
num_seqs = len(seq_lens)
ret = torch.randint(0, num_ranks, (num_seqs, )).cuda()
yield ret
@pytest.fixture
def scaling(self, adapter_ids):
yield torch.ones(adapter_ids.size(0)).cuda()
@pytest.fixture
def lora_a(self, ranks, head_size, dtype):
out = []
for rank in ranks:
w = torch.rand(head_size, rank, dtype=dtype).cuda()
out.append(w)
yield out
@pytest.fixture
def lora_b(self, ranks, out_head_size, dtype):
out = []
for rank in ranks:
w = torch.rand(rank, out_head_size, dtype=dtype).cuda()
out.append(w)
yield out
@pytest.fixture
def page_table(self, ranks):
total_ranks = sum(ranks)
index = torch.randperm(total_ranks)
index = index.split(ranks.tolist())
yield pad_sequence(index, batch_first=True).cuda()
@pytest.fixture
def paged_lora_a(self, lora_a, ranks, page_table, head_size, dtype):
num_pages = sum(ranks)
cache = torch.empty(num_pages, head_size, dtype=dtype).cuda()
for index, r, w in zip(page_table, ranks, lora_a):
cache[index[:r]] = w.t()
yield cache
@pytest.fixture
def paged_lora_b(self, lora_b, ranks, page_table, head_size, out_head_size,
dtype):
num_pages = sum(ranks)
cache = torch.empty(num_pages, head_size, dtype=dtype).cuda()
for index, r, w in zip(page_table, ranks, lora_b):
cache[index[:r], :out_head_size] = w
yield cache
@pytest.fixture
def gt(self, input, start_loc, seq_lens, adapter_ids, lora_a, lora_b):
out = []
for loc, s_len, r_id in zip(start_loc, seq_lens, adapter_ids):
inp = input[loc:loc + s_len]
l_a = lora_a[r_id]
l_b = lora_b[r_id]
out.append(inp @ l_a @ l_b)
yield torch.cat(out)
def test_mbgmm(self, input, paged_lora_a, paged_lora_b, out_head_size,
start_loc, seq_lens, adapter_ids, scaling, page_table,
ranks, page_start, gt):
max_seq_len = max(seq_lens).item()
max_rank = page_table.size(-1)
xa = mbgmm_a(input,
paged_lora_a,
q_start_loc=start_loc,
q_seqlens=seq_lens,
adapter_ids=adapter_ids,
rank_page_table=page_table,
rank_page_start=page_start,
ranks=ranks,
max_seq_len=max_seq_len,
max_rank=max_rank)
output = mbgmm_b(xa,
paged_lora_b[..., :out_head_size],
q_start_loc=start_loc,
q_seqlens=seq_lens,
adapter_ids=adapter_ids,
scaling=scaling,
rank_page_table=page_table,
rank_page_start=page_start,
ranks=ranks,
max_seq_len=max_seq_len,
max_rank=max_rank)
torch.testing.assert_close(gt, output)
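
# --- New test file: mbgmv kernels ---
# Same idea as the mbgmm tests but for mbgmv_a/mbgmv_b, which take one
# input vector per sequence.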
import pytest
import torch
from torch.nn.utils.rnn import pad_sequence
from lmdeploy.pytorch.kernels.mbgmv import mbgmv_a, mbgmv_b
class TestMBGMV:
@pytest.fixture
def dtype(self):
yield torch.float16
@pytest.fixture
def head_size(self):
yield 64
@pytest.fixture
def out_head_size(self):
yield 32
@pytest.fixture
def batch_size(self):
yield 8
@pytest.fixture
def ranks(self):
yield torch.tensor([2, 4]).cuda()
@pytest.fixture
def page_start(self, ranks):
yield torch.zeros_like(ranks)
@pytest.fixture
def input(self, batch_size, head_size, dtype):
x = torch.rand(batch_size, head_size, dtype=dtype).cuda()
x -= 0.5
yield x
@pytest.fixture
def adapter_ids(self, batch_size, ranks):
num_ranks = len(ranks)
ret = torch.randint(0, num_ranks, (batch_size, )).cuda()
yield ret
@pytest.fixture
def scaling(self, adapter_ids):
yield torch.ones(adapter_ids.size(0)).cuda()
@pytest.fixture
def lora_a(self, ranks, head_size, dtype):
out = []
for rank in ranks:
w = torch.rand(head_size, rank, dtype=dtype).cuda()
w -= 0.5
out.append(w)
yield out
@pytest.fixture
def lora_b(self, ranks, out_head_size, dtype):
out = []
for rank in ranks:
w = torch.rand(rank, out_head_size, dtype=dtype).cuda()
w -= 0.5
out.append(w)
yield out
@pytest.fixture
def page_table(self, ranks):
total_ranks = sum(ranks)
index = torch.randperm(total_ranks)
index = index.split(ranks.tolist())
yield pad_sequence(index, batch_first=True).cuda()
@pytest.fixture
def paged_lora_a(self, lora_a, ranks, page_table, head_size, dtype):
num_pages = sum(ranks)
cache = torch.empty(num_pages, head_size, dtype=dtype).cuda()
for index, r, w in zip(page_table, ranks, lora_a):
cache[index[:r]] = w.t()
yield cache
@pytest.fixture
def paged_lora_b(self, lora_b, ranks, page_table, head_size, out_head_size,
dtype):
num_pages = sum(ranks)
cache = torch.empty(num_pages, head_size, dtype=dtype).cuda()
for index, r, w in zip(page_table, ranks, lora_b):
cache[index[:r], :out_head_size] = w
yield cache
@pytest.fixture
def gt(self, input, adapter_ids, lora_a, lora_b):
out = []
for inp, r_id in zip(input, adapter_ids):
inp = inp.unsqueeze(0)
l_a = lora_a[r_id]
l_b = lora_b[r_id]
out.append(inp @ l_a @ l_b)
yield torch.cat(out)
def test_mbgmv(self, input, paged_lora_a, paged_lora_b, out_head_size,
adapter_ids, scaling, page_table, ranks, page_start, gt):
max_rank = page_table.size(-1)
xa = mbgmv_a(input,
paged_lora_a,
adapter_ids=adapter_ids,
rank_page_table=page_table,
rank_page_start=page_start,
ranks=ranks,
max_rank=max_rank)
output = mbgmv_b(xa,
paged_lora_b[..., :out_head_size],
adapter_ids=adapter_ids,
scaling=scaling,
rank_page_table=page_table,
rank_page_start=page_start,
ranks=ranks,
max_rank=max_rank)
torch.testing.assert_close(gt, output, atol=2e-3, rtol=1e-5)
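
# --- New test file: multinomial_sampling kernel ---
# Scores are one-hot per row, so sampling must return the index given by
# select_ids after remapping through the permuted `indices`.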
import pytest
import torch
from lmdeploy.pytorch.kernels import multinomial_sampling
class TestMultinomialSampling:
@pytest.fixture
def num_tokens(self, request):
yield request.param
@pytest.fixture
def select_ids(self, request):
yield request.param
@pytest.fixture
def batch_size(self, select_ids):
yield len(select_ids)
@pytest.fixture
def dtype(self, request):
yield request.param
@pytest.fixture
def scores(self, num_tokens, batch_size, select_ids, dtype):
ret = torch.zeros(batch_size, num_tokens).cuda()
batch_ids = torch.arange(batch_size).cuda()
ret[batch_ids, select_ids] = 1
ret = ret.to(dtype)
yield ret
@pytest.fixture
def seeds(self, batch_size):
yield torch.randint(1000, 2000, (batch_size, )).cuda()
@pytest.fixture
def offsets(self, batch_size):
yield torch.randint(1000, 2000, (batch_size, )).cuda()
@pytest.fixture
def indices(self, scores):
num_tokens = scores.size(1)
ret = [torch.randperm(num_tokens) for _ in scores]
ret = torch.stack(ret, 0).cuda()
yield ret
@pytest.fixture
def gt(self, batch_size, select_ids, indices):
batch_ids = torch.arange(batch_size).cuda()
yield indices[batch_ids, select_ids]
@pytest.mark.parametrize('dtype',
[torch.float32, torch.half, torch.bfloat16])
@pytest.mark.parametrize(['num_tokens', 'select_ids'], [
(8, (4, 2) * 30),
(200, (50, 150)),
],
indirect=True)
def test_multinomial_sampling(self, scores, seeds, offsets, indices, gt):
output = multinomial_sampling(scores, seeds, offsets, indices)
torch.testing.assert_close(output, gt)
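
# --- New test file: paged attention kernels ---
# Compares paged_attention_fwd against a naive masked attention reference,
# and the sliding-window path against flash_attn_varlen_func.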
import math
import pytest
import torch
def _conti_input(data, seq_lens):
data = [x[:l] for x, l in zip(data, seq_lens)]
data = torch.cat(data, dim=0)
return data
def _make_bias(seq_lens, history_lens, neg_val):
full_seq_lens = seq_lens + history_lens
max_seq_len = seq_lens.max().item()
max_full_len = full_seq_lens.max().item()
seq_ranges = [torch.arange(max_seq_len) for _ in seq_lens]
for r, l in zip(seq_ranges, seq_lens):
r[l:] = -max_full_len
seq_ranges = torch.stack(seq_ranges, dim=0).cuda()
kv_ranges = [torch.arange(max_full_len) for _ in full_seq_lens]
kv_ranges = torch.stack(kv_ranges, 0).cuda()
mask = kv_ranges[:, None, :] - seq_ranges[:, :, None] > history_lens[:,
None,
None]
return mask.float() * neg_val
def _make_blocked_cache(batched_k, batched_v, seq_lens, history_lens,
block_offsets, block_size, num_heads_k, feat_dim):
max_blocks_nums = block_offsets.max() + 1
full_seq_lens = seq_lens + history_lens
blocked_k = batched_k.new_zeros(max_blocks_nums, block_size, num_heads_k,
feat_dim)
blocked_v = batched_v.new_zeros(max_blocks_nums, block_size, num_heads_k,
feat_dim)
for batch_id, offset in enumerate(block_offsets):
ori_k = batched_k[batch_id]
ori_v = batched_v[batch_id]
seq_len = full_seq_lens[batch_id]
for block_id, block_start in enumerate(range(0, seq_len, block_size)):
block_off = offset[block_id]
tmp_k = ori_k[block_start:block_start + block_size]
tmp_v = ori_v[block_start:block_start + block_size]
size = tmp_k.size(0)
blocked_k[block_off, :size] = tmp_k
blocked_v[block_off, :size] = tmp_v
return blocked_k, blocked_v
def _naive_attention(batched_q, batched_kv, bias):
batched_k, batched_v = batched_kv
num_heads_q = batched_q.shape[2]
num_heads_k = batched_k.shape[2]
head_dim = batched_q.shape[-1]
group = num_heads_q // num_heads_k
q = batched_q.transpose(1, 2)
k = batched_k.permute(0, 2, 3, 1)
v = batched_v.transpose(1, 2)
# expand group
k = k.unsqueeze(2).expand(-1, -1, group, -1, -1).flatten(1, 2)
v = v.unsqueeze(2).expand(-1, -1, group, -1, -1).flatten(1, 2)
qk = torch.matmul(q, k) / math.sqrt(head_dim)
attn_weight = qk + bias[:, None]
attn_weight = torch.softmax(attn_weight, dim=-1, dtype=torch.float32)
attn_weight = attn_weight.to(q.dtype)
attn_output = torch.matmul(attn_weight, v)
attn_output = attn_output.transpose(1, 2).contiguous()
return attn_output
def _naive_window_attention(q, k, v, seqlens_q, seqlens_k, window_size):
from flash_attn import flash_attn_varlen_func
def _make_cu_seqlens(seqlens):
cu_seqlens = seqlens.cumsum(0)
cu_zero = cu_seqlens.new_zeros(1)
cu_seqlens = torch.cat([cu_zero, cu_seqlens])
return cu_seqlens
max_seqlen_q = seqlens_q.max().item()
max_seqlen_k = seqlens_k.max().item()
cu_seqlens_q = _make_cu_seqlens(seqlens_q).int()
cu_seqlens_k = _make_cu_seqlens(seqlens_k).int()
output = flash_attn_varlen_func(q,
k,
v,
cu_seqlens_q,
cu_seqlens_k,
max_seqlen_q=max_seqlen_q,
max_seqlen_k=max_seqlen_k,
causal=True,
window_size=window_size)
return output
class TestPagedAttention:
@pytest.fixture
def dtype(self):
yield torch.float16
@pytest.fixture
def feat_dim(self):
yield 16
@pytest.fixture
def num_heads_q(self, request):
yield request.param
@pytest.fixture
def num_heads_k(self, request):
yield request.param
@pytest.fixture
def block_size(self, request):
yield request.param
@pytest.fixture
def seq_lens(self, request):
yield torch.tensor(request.param, device='cuda')
@pytest.fixture
def start_loc(self, seq_lens):
seq_sum = seq_lens.cumsum(0)
start_loc = torch.cat([seq_sum.new_zeros(1), seq_sum[:-1]], dim=0)
yield start_loc
@pytest.fixture
def history_lens(self, request):
yield torch.tensor(request.param, device='cuda')
@pytest.fixture
def batched_q(self, seq_lens, num_heads_q, feat_dim, dtype):
torch.manual_seed(123)
batch_size = len(seq_lens)
max_seq_len = seq_lens.max().item()
inputs = torch.rand(batch_size,
max_seq_len,
num_heads_q,
feat_dim,
dtype=dtype,
device='cuda')
yield inputs
@pytest.fixture
def batched_kv(self, seq_lens, history_lens, num_heads_k, feat_dim, dtype):
torch.manual_seed(123)
batch_size = len(seq_lens)
full_seq_lens = seq_lens + history_lens
max_seq_len = full_seq_lens.max().item()
k = torch.rand(batch_size,
max_seq_len,
num_heads_k,
feat_dim,
dtype=dtype,
device='cuda')
v = torch.rand(batch_size,
max_seq_len,
num_heads_k,
feat_dim,
dtype=dtype,
device='cuda')
yield k, v
@pytest.fixture
def conti_q(self, seq_lens, batched_q):
yield _conti_input(batched_q, seq_lens)
@pytest.fixture
def block_offsets(self, seq_lens, history_lens, block_size):
full_seq_lens = seq_lens + history_lens
batch_size = full_seq_lens.size(0)
num_blocks = (full_seq_lens + block_size - 1) // block_size
offset = [
torch.arange(size) * batch_size + idx
for idx, size in enumerate(num_blocks)
]
max_len = max(len(o) for o in offset)
new_offset = offset[0].new_zeros(batch_size, max_len)
for o, no in zip(offset, new_offset):
len_o = o.size(0)
no[:len_o] = o
yield new_offset.cuda()
@pytest.fixture
def conti_kv(self, batched_kv, seq_lens, history_lens):
full_seq_lens = seq_lens + history_lens
conti_k = _conti_input(batched_kv[0], full_seq_lens)
conti_v = _conti_input(batched_kv[1], full_seq_lens)
yield (conti_k, conti_v)
@pytest.fixture
def blocked_kv(self, batched_kv, seq_lens, history_lens, block_offsets,
block_size, num_heads_k, feat_dim):
batched_k, batched_v = batched_kv
yield _make_blocked_cache(batched_k, batched_v, seq_lens, history_lens,
block_offsets, block_size, num_heads_k,
feat_dim)
@pytest.fixture
def mask(self, seq_lens, history_lens):
neg_val = -1e30
yield _make_bias(seq_lens, history_lens, neg_val)
@pytest.fixture
def gt(self, batched_q, batched_kv, mask):
yield _naive_attention(batched_q, batched_kv, mask)
@pytest.fixture
def conti_gt(self, gt, seq_lens):
yield _conti_input(gt, seq_lens)
@pytest.mark.parametrize(['num_heads_q', 'num_heads_k'], [(4, 2)],
indirect=True)
@pytest.mark.parametrize(['seq_lens', 'history_lens'],
[([30, 50, 70, 90], [50, 40, 30, 20]),
([1, 1, 1, 1], [50, 40, 30, 20])],
indirect=True)
@pytest.mark.parametrize('block_size', [2, 16], indirect=True)
def test_paged_attention(self, conti_q, blocked_kv, block_offsets,
start_loc, seq_lens, history_lens, conti_gt):
from lmdeploy.pytorch.kernels import paged_attention_fwd
kv_seq_lens = seq_lens + history_lens
max_seq_len = seq_lens.max().item()
blocked_k, blocked_v = blocked_kv
out = torch.empty_like(conti_q)
paged_attention_fwd(conti_q,
blocked_k,
blocked_v,
out,
block_offsets=block_offsets,
q_start_loc=start_loc,
q_seqlens=seq_lens,
kv_seqlens=kv_seq_lens,
max_seqlen=max_seq_len)
torch.testing.assert_close(out, conti_gt, atol=1e-3, rtol=1e-5)
@pytest.fixture
def win_size(self, request):
yield request.param
@pytest.fixture
def window_gt(self, conti_q, conti_kv, seq_lens, history_lens, win_size):
kv_lens = seq_lens + history_lens
yield _naive_window_attention(conti_q,
conti_kv[0],
conti_kv[1],
seq_lens,
kv_lens,
window_size=(win_size, win_size))
@pytest.mark.parametrize(['num_heads_q', 'num_heads_k'], [(4, 2)],
indirect=True)
@pytest.mark.parametrize(['seq_lens', 'history_lens'], [
([30, 50, 70, 90], [50, 40, 30, 20]),
([1, 1, 1, 1], [50, 40, 30, 20]),
],
indirect=True)
@pytest.mark.parametrize('win_size', (32, ), indirect=True)
@pytest.mark.parametrize('block_size', [16], indirect=True)
def test_window_attention(self, conti_q, blocked_kv, block_offsets,
start_loc, seq_lens, history_lens, win_size,
window_gt):
from lmdeploy.pytorch.kernels import paged_attention_fwd
kv_seq_lens = seq_lens + history_lens
max_seq_len = seq_lens.max().item()
blocked_k, blocked_v = blocked_kv
out = torch.empty_like(conti_q)
paged_attention_fwd(conti_q,
blocked_k,
blocked_v,
out,
block_offsets=block_offsets,
q_start_loc=start_loc,
q_seqlens=seq_lens,
kv_seqlens=kv_seq_lens,
max_seqlen=max_seq_len,
window_size=win_size)
torch.testing.assert_close(out, window_gt, atol=1e-3, rtol=1e-5)
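
# --- New test file: rearange_all_gather kernel ---
# Checks the gather rearrangement of per-adapter LoRA ranks for a world size
# of 2; positions beyond each adapter's rank are masked out before comparison.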
import pytest
import torch
from lmdeploy.pytorch.kernels.rearange_all_gather import rearange_all_gather
class TestRearangeAllGather:
@pytest.fixture
def seq_lens(self, request):
yield torch.tensor(request.param, device='cuda')
@pytest.fixture
def start_loc(self, seq_lens):
yield seq_lens.cumsum(0) - seq_lens
@pytest.fixture
def ranks(self):
yield torch.tensor([4, 8]).cuda()
@pytest.fixture
def adapter_ids(self, seq_lens, ranks):
num_ranks = len(ranks)
num_seqs = len(seq_lens)
ret = torch.randint(0, num_ranks, (num_seqs, )).cuda()
yield ret
@pytest.fixture
def world_size(self):
yield 2
@pytest.fixture
def input(self, seq_lens, ranks):
max_rank = max(ranks)
total_len = seq_lens.sum()
yield torch.rand(total_len, max_rank).cuda()
@pytest.fixture
def rank_per_input(self, seq_lens, ranks, adapter_ids):
token_adapter_ids = [
torch.full((slen, ), ada_id)
for slen, ada_id in zip(seq_lens, adapter_ids)
]
token_adapter_ids = torch.cat(token_adapter_ids).cuda()
yield ranks[token_adapter_ids]
@pytest.fixture
def valid_mask(self, rank_per_input, seq_lens, ranks):
max_rank = max(ranks)
total_len = seq_lens.sum()
mask = torch.zeros(total_len, max_rank).to(bool)
for r, m in zip(rank_per_input, mask):
m[:r] = True
yield mask.cuda()
@pytest.fixture
def gt(self, input, rank_per_input, ranks, world_size):
max_rank = max(ranks)
pranks = rank_per_input // world_size
pmax_rank = max_rank // world_size
output = torch.empty_like(input)
for pr, inp, out in zip(pranks, input, output):
pindex = torch.arange(pr).cuda()
index = [pindex + ws * pmax_rank for ws in range(world_size)]
index = torch.cat(index)
out[:index.size(0)] = inp[index]
yield output
@pytest.mark.parametrize('seq_lens', [[30, 50, 70, 90], [1, 1, 1, 1]],
indirect=True)
def test_gather(self, input, start_loc, seq_lens, adapter_ids, ranks,
world_size, gt, valid_mask):
max_seq_len = max(seq_lens)
output = rearange_all_gather(input,
start_loc,
seq_lens,
adapter_ids,
ranks,
world_size,
max_seq_len=max_seq_len)
output = output.where(valid_mask, output.new_tensor(0))
gt = gt.where(valid_mask, gt.new_tensor(0))
torch.testing.assert_close(output, gt)
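
# --- New test file: rms_norm kernel ---
# Compares the rms_norm kernel with a float32 reference implementation for
# bfloat16/float16/float32 inputs.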
import pytest
import torch
class TestRMSNorm:
@pytest.fixture(scope='class')
def dtype(self, request):
yield request.param
@pytest.fixture(scope='class')
def input(self, dtype):
yield torch.rand(4, 8, dtype=dtype, device='cuda')
@pytest.fixture(scope='class')
def weight(self, dtype):
yield torch.rand(8, dtype=dtype, device='cuda')
@pytest.fixture(scope='class')
def eps(self):
yield 1e-6
@pytest.fixture(scope='class')
def gt(self, input, weight, eps):
input_dtype = input.dtype
input = input.to(torch.float32)
variance = input.pow(2).mean(-1, keepdim=True)
input = input * torch.rsqrt(variance + eps)
return weight * input.to(input_dtype)
@pytest.mark.parametrize('dtype',
[torch.bfloat16, torch.float16, torch.float32],
indirect=True)
def test_rms_norm(self, input, weight, eps, gt):
from lmdeploy.pytorch.kernels import rms_norm
out = rms_norm(input, weight, eps)
torch.testing.assert_close(out, gt)
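
# --- New test file: paging block managers ---
# Covers LogicalAllocator plus the default and window block managers:
# allocation, free, append_slot, fork and swap in/out.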
import pytest
import torch
from lmdeploy.pytorch.messages import SchedulerSession
from lmdeploy.pytorch.paging.block_manager import (DefaultBlockManager,
WindowBlockManager)
from lmdeploy.pytorch.paging.block_manager.base_block_manager import \
LogicalAllocator # noqa: E501
class TestAllocator:
@pytest.fixture
def num_gpu_blocks(self):
yield 16
@pytest.fixture
def num_cpu_blocks(self):
yield 4
@pytest.fixture
def allocator(self, num_cpu_blocks, num_gpu_blocks):
yield LogicalAllocator(num_cpu_blocks, num_gpu_blocks)
def test_alloc(self, allocator, num_cpu_blocks, num_gpu_blocks):
# initialize
num_blocks = num_cpu_blocks + num_gpu_blocks
gpu_allocator = allocator.get_phy_allocator('gpu')
cpu_allocator = allocator.get_phy_allocator('cpu')
assert allocator.get_num_free_blocks() == num_blocks
assert cpu_allocator.get_num_free_blocks() == num_cpu_blocks
assert gpu_allocator.get_num_free_blocks() == num_gpu_blocks
# test allocate
block_size = 4
blocks = allocator.allocate(block_size, 'gpu')
assert len(blocks) == block_size
assert allocator.get_num_free_blocks() == num_blocks - block_size
assert gpu_allocator.get_num_free_blocks(
) == num_gpu_blocks - block_size
# test free
allocator.add_ref_count(blocks, 1)
allocator.free(blocks)
assert allocator.get_num_free_blocks() == num_blocks - block_size
allocator.free(blocks)
assert allocator.get_num_free_blocks() == num_blocks
assert gpu_allocator.get_num_free_blocks() == num_gpu_blocks
assert cpu_allocator.get_num_free_blocks() == num_cpu_blocks
def test_full(self, allocator, num_cpu_blocks, num_gpu_blocks):
num_blocks = num_cpu_blocks + num_gpu_blocks
gpu_allocator = allocator.get_phy_allocator('gpu')
cpu_allocator = allocator.get_phy_allocator('cpu')
# no free blocks
gpu_block_size = num_gpu_blocks
gpu_blocks = allocator.allocate(gpu_block_size, 'gpu')
cpu_block_size = num_cpu_blocks
cpu_blocks = allocator.allocate(cpu_block_size, 'cpu')
assert cpu_allocator.get_num_free_blocks() == 0
assert gpu_allocator.get_num_free_blocks() == 0
with pytest.raises(MemoryError):
allocator.allocate(1, 'gpu')
allocator.free(gpu_blocks)
allocator.free(cpu_blocks)
assert allocator.get_num_free_blocks() == num_blocks
assert gpu_allocator.get_num_free_blocks() == num_gpu_blocks
assert cpu_allocator.get_num_free_blocks() == num_cpu_blocks
class TestDefaultBlockManager:
@pytest.fixture
def block_size(self):
yield 16
@pytest.fixture
def num_cpu_blocks(self):
yield 4
@pytest.fixture
def num_gpu_blocks(self):
yield 4
@pytest.fixture
def block_mgr(self, num_cpu_blocks, num_gpu_blocks):
yield DefaultBlockManager(num_cpu_blocks, num_gpu_blocks)
def test_alloc(self, block_mgr, block_size, num_gpu_blocks):
sess = SchedulerSession(0, block_size)
# test alloc
token_ids = torch.tensor([1])
msg = sess.add_sequence(token_ids)
assert block_mgr.can_allocate(msg)
block_mgr.allocate(msg)
block_table = block_mgr.get_block_table(msg)
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 1
assert block_table is not None
assert len(block_table) == 1
# test free
block_mgr.free(msg)
block_table = block_mgr.get_block_table(msg)
assert block_table is None or len(block_table) == 0
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks
# alloc over limit
token_ids = torch.zeros((num_gpu_blocks * block_size + 1, ),
dtype=torch.int64)
msg = sess.add_sequence(token_ids)
assert not block_mgr.can_allocate(msg)
def test_append_slot(self, block_mgr, block_size, num_gpu_blocks):
sess = SchedulerSession(0, block_size)
# test append
token_ids = torch.tensor([1])
msg = sess.add_sequence(token_ids)
block_mgr.allocate(msg)
block_table = block_mgr.get_block_table(msg)
assert len(block_table) == 1
# no new logical block
msg.update_token_ids(torch.tensor([1] * (block_size - 1)))
assert block_mgr.can_append_slot(msg)
block_mgr.append_slot(msg)
block_table = block_mgr.get_block_table(msg)
assert len(block_table) == 1
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 1
# with new logical block
msg.update_token_ids(torch.tensor([1]))
block_mgr.append_slot(msg)
block_table = block_mgr.get_block_table(msg)
assert len(block_table) == 2
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 2
def test_fork(self, block_mgr, block_size, num_gpu_blocks):
sess = SchedulerSession(0, block_size)
token_ids = torch.tensor([1] * (block_size * 2 + 1))
from_msg = sess.add_sequence(token_ids)
block_mgr.allocate(from_msg)
from_block_table = block_mgr.get_block_table(from_msg)
assert len(from_block_table) == 3
to_msg = sess.fork_sequence(torch.tensor([1]), from_msg)
# fork
assert block_mgr.can_fork(from_msg)
copy_map = block_mgr.fork(from_msg, to_msg)
block_table = block_mgr.get_block_table(to_msg)
assert len(block_table) == 3
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 4
assert block_table[0] == from_block_table[0]
assert block_table[1] == from_block_table[1]
assert block_table[2] != from_block_table[2]
assert len(copy_map) == 1
assert copy_map[from_block_table[2]] == block_table[2]
# can not fork
assert not block_mgr.can_fork(from_msg)
def test_swap(self, block_mgr, block_size, num_gpu_blocks):
sess = SchedulerSession(0, block_size)
token_ids = torch.tensor([1] * (block_size + 1))
msg = sess.add_sequence(token_ids)
block_mgr.allocate(msg)
old_phy_blocks = block_mgr.get_block_table(msg)
success, swap_map = block_mgr.try_swap_out(msg)
new_phy_blocks = block_mgr.get_block_table(msg)
assert success
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks
assert block_mgr.get_num_free_cpu_blocks() == num_gpu_blocks - 2
assert len(swap_map) == 2
for block_id in old_phy_blocks:
assert block_id in swap_map
for block_id in new_phy_blocks:
assert block_id - num_gpu_blocks in swap_map.values()
old_phy_blocks = block_mgr.get_block_table(msg)
success, swap_map = block_mgr.try_swap_in(msg)
new_phy_blocks = block_mgr.get_block_table(msg)
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 2
assert block_mgr.get_num_free_cpu_blocks() == num_gpu_blocks
assert len(swap_map) == 2
for block_id in old_phy_blocks:
assert block_id - num_gpu_blocks in swap_map
for block_id in new_phy_blocks:
assert block_id in swap_map.values()
success, swap_map = block_mgr.try_swap_out(msg)
assert success
token_ids = torch.tensor([1] * (block_size * 4))
msg_full = sess.add_sequence(token_ids)
block_mgr.allocate(msg_full)
success, swap_map = block_mgr.try_swap_out(msg)
assert not success
class TestWindowBlockManager:
@pytest.fixture
def window_size(self):
yield 32
@pytest.fixture
def block_size(self):
yield 16
@pytest.fixture
def num_cpu_blocks(self):
yield 4
@pytest.fixture
def num_gpu_blocks(self):
yield 4
@pytest.fixture
def block_mgr(self, num_cpu_blocks, num_gpu_blocks, window_size):
yield WindowBlockManager(num_cpu_blocks, num_gpu_blocks, window_size)
def test_alloc(self, block_mgr, block_size, num_gpu_blocks):
sess = SchedulerSession(0, block_size)
# test alloc
token_ids = torch.tensor([1])
msg = sess.add_sequence(token_ids)
assert block_mgr.can_allocate(msg)
block_mgr.allocate(msg)
block_table = block_mgr.get_block_table(msg)
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 1
assert block_table is not None
assert len(block_table) == 1
# test free
block_mgr.free(msg)
block_table = block_mgr.get_block_table(msg)
assert block_table is None or len(block_table) == 0
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks
# alloc over limit
token_ids = torch.zeros((num_gpu_blocks * block_size + 1, ),
dtype=torch.int64)
msg = sess.add_sequence(token_ids)
assert not block_mgr.can_allocate(msg)
def test_win_alloc(self, block_mgr, block_size, num_gpu_blocks,
window_size):
sess = SchedulerSession(0, block_size)
# 2 win block
token_ids = torch.tensor([1] * window_size)
msg = sess.add_sequence(token_ids)
block_mgr.allocate(msg)
msg.update_token_ids(torch.tensor([1]))
block_mgr.allocate(msg)
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 3
block_table = block_mgr.get_block_table(msg)
assert block_table is None or len(block_table) == 3
block_mgr.free(msg)
# 3 win block
token_ids = torch.tensor([1] * (window_size + 2))
msg = sess.add_sequence(token_ids)
block_mgr.allocate(msg)
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 3
msg.update_token_ids(torch.tensor([1]))
block_mgr.allocate(msg)
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 3
block_table = block_mgr.get_block_table(msg)
assert block_table is None or len(block_table) == 3
block_mgr.free(msg)
# not full win
token_ids = torch.tensor([1] * (window_size - 2))
msg = sess.add_sequence(token_ids)
block_mgr.allocate(msg)
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 2
msg.update_token_ids(torch.tensor([1]))
block_mgr.allocate(msg)
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 2
block_table = block_mgr.get_block_table(msg)
assert block_table is None or len(block_table) == 2
block_mgr.free(msg)
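
# --- New test file: paging scheduler ---
# Exercises prefill/decoding scheduling, sequence and session state updates,
# and the resulting swap-in/swap-out maps.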
import pytest
import torch
from lmdeploy.pytorch.config import CacheConfig, SchedulerConfig
from lmdeploy.pytorch.messages import MessageStatus
from lmdeploy.pytorch.paging.scheduler import Scheduler
class TestScheduler:
@pytest.fixture
def block_size(self):
yield 16
@pytest.fixture
def num_cpu_blocks(self):
yield 4
@pytest.fixture
def num_gpu_blocks(self):
yield 4
@pytest.fixture
def cache_config(self, block_size, num_cpu_blocks, num_gpu_blocks):
yield CacheConfig(block_size=block_size,
num_cpu_blocks=num_cpu_blocks,
num_gpu_blocks=num_gpu_blocks)
@pytest.fixture
def scheduler_config(self):
yield SchedulerConfig(max_batches=4,
max_session_len=128,
max_request_output_len=64,
eviction_type='copy')
@pytest.fixture
def scheduler(self, cache_config, scheduler_config):
yield Scheduler(scheduler_config=scheduler_config,
cache_config=cache_config)
def test_schedule_base(self, scheduler, block_size, num_gpu_blocks):
block_manager = scheduler.block_manager
session_id = 0
session = scheduler.add_session(session_id)
assert session_id in scheduler.sessions
assert scheduler.sessions[session_id] == session
num_blocks = 2
token_ids = torch.tensor([0] * block_size * num_blocks)
seq = session.add_sequence(token_ids)
scheduler.add_sequence(seq)
assert seq.status == MessageStatus.WAITING
assert seq in scheduler.waiting
output = scheduler.schedule(is_prefill=True)
block_tables = scheduler.get_block_tables(output.running)
assert seq.status == MessageStatus.RUNNING
assert seq in output.running
assert len(block_tables) == 1
assert len(block_tables[0]) == num_blocks
assert block_manager.get_num_free_gpu_blocks(
) == num_gpu_blocks - num_blocks
assert scheduler.has_unfinished()
def test_update(self, scheduler, block_size, num_gpu_blocks):
block_manager = scheduler.block_manager
session_id1 = 0
session1 = scheduler.add_session(session_id1)
token_ids1 = torch.tensor([0] * block_size * 1)
seq1 = session1.add_sequence(token_ids1)
scheduler.add_sequence(seq1)
session_id2 = 1
session2 = scheduler.add_session(session_id2)
token_ids2 = torch.tensor([0] * block_size * 2)
seq2 = session2.add_sequence(token_ids2)
scheduler.add_sequence(seq2)
token_ids3 = torch.tensor([0] * block_size * 3)
seq3 = session2.add_sequence(token_ids3)
scheduler.add_sequence(seq3)
scheduler.schedule(is_prefill=True)
assert seq1.status == MessageStatus.RUNNING
assert seq2.status == MessageStatus.RUNNING
assert seq3.status == MessageStatus.WAITING
# stop seq
seq1.status = MessageStatus.STOPPED
scheduler.update()
assert len(scheduler.running) == 1
assert seq1 in scheduler.hanging
# end seq
seq1.status = MessageStatus.ENDED
scheduler.update()
assert session_id1 in scheduler.sessions
assert seq1 not in scheduler.running
assert seq1 not in scheduler.hanging
assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks - 2
# stop session
scheduler.stop_session(session_id2)
scheduler.update()
assert len(scheduler.running) == 0
assert len(scheduler.waiting) == 0
assert len(scheduler.hanging) == 2
# end session
scheduler.end_session(session_id2)
scheduler.update()
assert seq2.status == MessageStatus.ENDED
assert seq3.status == MessageStatus.ENDED
assert session_id2 not in scheduler.sessions
assert len(scheduler.hanging) == 0
assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
def test_swap(self, scheduler, block_size, num_gpu_blocks, num_cpu_blocks):
block_manager = scheduler.block_manager
session_id = 0
session = scheduler.add_session(session_id)
# test: add 3 seq
token_ids1 = torch.tensor([0] * block_size * 1)
seq1 = session.add_sequence(token_ids1)
scheduler.add_sequence(seq1)
token_ids2 = torch.tensor([0] * block_size * 2)
seq2 = session.add_sequence(token_ids2)
scheduler.add_sequence(seq2)
token_ids3 = torch.tensor([0] * block_size * 3)
seq3 = session.add_sequence(token_ids3)
scheduler.add_sequence(seq3)
scheduler.schedule(is_prefill=True)
# seq1: 1 running gpu
# seq2: 2 running gpu
# seq3: 3 waiting empty
assert seq1.status == MessageStatus.RUNNING
assert seq2.status == MessageStatus.RUNNING
assert seq3.status == MessageStatus.WAITING
assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks - 3
# test: waiting alloc
seq2.status = MessageStatus.STOPPED
scheduler.update()
assert len(scheduler.running) == 1
assert len(scheduler.waiting) == 1
assert len(scheduler.hanging) == 1
output = scheduler.schedule(is_prefill=True)
# seq1: 1 running gpu
# seq2: 2 hanging cpu
# seq3: 3 waiting gpu
assert seq1.status == MessageStatus.RUNNING
assert seq2.status == MessageStatus.STOPPED
assert seq3.status == MessageStatus.RUNNING
assert block_manager.get_num_free_gpu_blocks() == 0
assert block_manager.get_num_free_cpu_blocks() == num_cpu_blocks - 2
assert len(output.swap_out_map) == 2
# test: waiting append token
seq2.status = MessageStatus.WAITING
seq3.status = MessageStatus.ENDED
seq2.update_token_ids(torch.tensor([1] * block_size))
scheduler.update()
assert len(scheduler.running) == 1
assert len(scheduler.waiting) == 1
assert len(scheduler.hanging) == 0
output = scheduler.schedule(is_prefill=True)
# seq1: 1 running gpu
# seq2: 3 running gpu
# seq3: 3 nan
assert seq1.status == MessageStatus.RUNNING
assert seq2.status == MessageStatus.RUNNING
assert block_manager.get_num_free_gpu_blocks() == 0
assert block_manager.get_num_free_cpu_blocks() == num_cpu_blocks
assert len(output.swap_in_map) == 2
# test running append
seq1.update_token_ids(torch.tensor([1] * block_size))
seq2.update_token_ids(torch.tensor([1] * block_size))
scheduler.update()
assert len(scheduler.running) == 2
output = scheduler.schedule(is_prefill=False)
# seq1: 1 waiting cpu
# seq2: 4 running gpu
# seq3: 3 nan
assert seq1.status == MessageStatus.WAITING
assert seq2.status == MessageStatus.RUNNING
assert block_manager.get_num_free_gpu_blocks() == 0
assert block_manager.get_num_free_cpu_blocks() == num_cpu_blocks - 1
assert len(output.swap_out_map) == 1
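
# --- New test file: layout conversion tools ---
# Round-trips between padded batched tensors and continuous (packed) tensors
# via continuous_tensor and batch_tensor.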
import pytest
import torch
from lmdeploy.pytorch.tools.layout_convert import (batch_tensor,
continuous_tensor)
class TestContinuous:
@pytest.fixture
def batched_tensor(self):
yield torch.tensor([[1, 2, 3, 0, 0], [4, 5, 6, 7, 8], [9, 10, 0, 0,
0]])
@pytest.fixture
def seq_len(self):
yield torch.tensor([3, 5, 2])
@pytest.fixture
def conti_tensor(self):
yield torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
def test_conti_tensor(self, batched_tensor, seq_len, conti_tensor):
conti_out = continuous_tensor(batched_tensor, seq_len)
torch.testing.assert_close(conti_out, conti_tensor)
batched_out = batch_tensor(conti_tensor, seq_len)
torch.testing.assert_close(batched_out, batched_tensor)
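
# --- New test file: input construction tools ---
# Checks position ids, q_start_loc and block offsets produced by
# make_model_inputs / make_step_context.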
import pytest
import torch
from lmdeploy.pytorch.tools.make_inputs import (make_model_inputs,
make_step_context)
class TestMakeInputs:
@pytest.fixture
def seq_length(self):
yield torch.tensor([2, 4, 3])
@pytest.fixture
def history_length(self):
yield [10, 12, 6]
@pytest.fixture
def input_ids(self, seq_length):
batch_size = len(seq_length)
max_seq_len = max(seq_length)
yield torch.randint(0, 100, (batch_size, max_seq_len))
@pytest.fixture
def block_size(self):
yield 4
@pytest.fixture
def num_key_value_heads(self):
yield 1
@pytest.fixture
def head_size(self):
yield 4
@pytest.fixture
def kv_cache_dtype(self):
yield torch.float16
@pytest.fixture
def past_key_values(self, history_length, num_key_value_heads, head_size):
max_len = max(history_length)
batch_size = len(history_length)
k_cache = torch.rand(batch_size, num_key_value_heads, max_len,
head_size)
v_cache = k_cache + 1
yield [(k_cache, v_cache)]
def test_make_inputs(self, input_ids, seq_length, history_length):
model_inputs = make_model_inputs(input_ids,
seq_length=seq_length,
block_offsets=None,
history_length=history_length)
position_ids = torch.tensor([
[10, 11, 11, 11],
[12, 13, 14, 15],
[6, 7, 8, 8],
])
q_start_loc = torch.tensor([0, 2, 6])
torch.testing.assert_close(model_inputs.position_ids, position_ids)
torch.testing.assert_close(model_inputs.q_start_loc, q_start_loc)
def test_make_step_context(self, input_ids, seq_length, history_length,
past_key_values, block_size,
num_key_value_heads, head_size, kv_cache_dtype):
step_ctx = make_step_context(input_ids,
seq_length=seq_length,
history_length=history_length,
past_key_values=past_key_values,
world_size=1,
device='cuda',
block_size=block_size,
num_key_value_heads=num_key_value_heads,
head_size=head_size,
kv_cache_dtype=kv_cache_dtype)
block_offsets = step_ctx.block_offsets
assert block_offsets[0][3] == 0
assert block_offsets[1][3] != 0
assert block_offsets[2][3] == 0
kv_caches = step_ctx.kv_caches
assert len(kv_caches) == len(past_key_values)
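
# --- New test file: model name deduction ---
# deduce_a_name should resolve 'internlm' from any combination of model path,
# model name, chat template config and backend config.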
import pytest
from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig
from lmdeploy.model import ChatTemplateConfig
from lmdeploy.serve.async_engine import deduce_a_name
@pytest.mark.parametrize(
'backend_config',
[TurbomindEngineConfig('internlm'),
PytorchEngineConfig(None), None])
@pytest.mark.parametrize(
'chat_template_config',
[ChatTemplateConfig('internlm'),
ChatTemplateConfig(None), None])
@pytest.mark.parametrize('model_name', ['internlm', None])
@pytest.mark.parametrize('model_path', ['/path/to/internlm-chat-7b'])
def test_deduce_a_name(model_path, model_name, chat_template_config,
backend_config):
name = deduce_a_name(model_path, model_name, chat_template_config,
backend_config)
assert name == 'internlm'
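
# --- New test file: automatic backend selection ---
# Checks is_supported for the pytorch and turbomind backends and
# autoget_backend(_config) against a table of example models.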
import os
import tempfile
import numpy as np
import pytest
class TestAutoBackend:
@pytest.fixture
def turbomind_workspace(self):
workspace = tempfile.TemporaryDirectory(
'internlm-chat-7b-turbomind').name
os.makedirs(os.path.join(workspace, 'triton_models'), exist_ok=True)
return workspace
@pytest.fixture
def models(self):
# example models to test
# format (model_path, is_pytorch_supported, is_turbomind_supported)
models = [
('baichuan-inc/Baichuan-7B', False, True),
('baichuan-inc/Baichuan2-7B-Chat', True, True),
('baichuan-inc/Baichuan-13B-Chat', False, False),
('baichuan-inc/Baichuan2-13B-Chat', True, False),
('internlm/internlm-chat-7b', True, True),
('internlm/internlm2-chat-7b', True, True),
('internlm/internlm-xcomposer2-7b', False, False),
('internlm/internlm-xcomposer-7b', False, True),
('THUDM/chatglm2-6b', True, False),
('THUDM/chatglm3-6b', True, False),
('deepseek-ai/deepseek-moe-16b-chat', True, False),
('tiiuae/falcon-7b-instruct', True, False),
('01-ai/Yi-34B-Chat', True, True),
('codellama/CodeLlama-7b-Instruct-hf', True, True),
('mistralai/Mistral-7B-Instruct-v0.1', True, False),
('mistralai/Mixtral-8x7B-Instruct-v0.1', True, False),
('Qwen/Qwen-7B-Chat', False, True),
('Qwen/Qwen-VL-Chat', False, True),
('Qwen/Qwen1.5-4B-Chat', True, False),
]
return models
    def test_pytorch_is_supported(self, turbomind_workspace, models):
from lmdeploy.pytorch.supported_models import is_supported
assert is_supported(turbomind_workspace) is False
for m, flag, _ in models:
assert is_supported(m) is flag
    def test_turbomind_is_supported(self, turbomind_workspace, models):
from lmdeploy.turbomind.supported_models import is_supported
assert is_supported(turbomind_workspace) is True
for m, _, flag in models:
assert is_supported(m) is flag
def test_autoget_backend(self, turbomind_workspace, models):
from lmdeploy.archs import autoget_backend
assert autoget_backend(turbomind_workspace) == 'turbomind'
n = len(models)
choices = np.random.choice(n, n // 2, replace=False)
for i in choices:
model, is_support_pytorch, is_support_turbomind = models[i]
target = 'turbomind' if is_support_turbomind else 'pytorch'
backend = autoget_backend(model)
assert backend == target
def test_autoget_backend_config(self, turbomind_workspace):
from lmdeploy.archs import autoget_backend_config
from lmdeploy.messages import (PytorchEngineConfig,
TurbomindEngineConfig)
assert type(autoget_backend_config(
turbomind_workspace)) is TurbomindEngineConfig
assert type(autoget_backend_config(
'internlm/internlm-chat-7b')) is TurbomindEngineConfig
assert type(
autoget_backend_config(
'mistralai/Mistral-7B-Instruct-v0.1')) is PytorchEngineConfig
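
# --- New test file: turbomind model-type detection ---
# get_model_from_config should map each downloaded config.json to the
# expected turbomind model name.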
import os
import pytest
from lmdeploy.turbomind.utils import get_model_from_config
@pytest.mark.parametrize('item',
[('baichuan-inc/Baichuan-7B', 'baichuan'),
('baichuan-inc/Baichuan2-7B-Base', 'baichuan2'),
('internlm/internlm2-7b', 'internlm2'),
('internlm/internlm2-chat-7b', 'internlm2'),
('internlm/internlm2-math-20b', 'internlm2'),
('internlm/internlm-20b', 'llama'),
('NousResearch/Llama-2-7b-chat-hf', 'llama'),
('Qwen/Qwen-7B-Chat', 'qwen'),
('Qwen/Qwen-14B', 'qwen'),
('NousResearch/Nous-Hermes-2-SOLAR-10.7B', 'llama'),
('01-ai/Yi-34B-Chat', 'llama')])
def test_get_model_from_config(item):
from transformers.utils import cached_file
model_id, result = item
local_file = cached_file(model_id, 'config.json')
local_dir = os.path.dirname(local_file)
print(get_model_from_config(local_dir))
assert get_model_from_config(local_dir) == result
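
# --- New test file: generation config conversion ---
# EngineGenerationConfig.From should copy GenerationConfig fields and convert
# stop words to token ids.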
from typing import List
from lmdeploy import EngineGenerationConfig, GenerationConfig, Tokenizer
def test_engine_generation_config():
tokenizer = Tokenizer('internlm/internlm-chat-7b')
config = GenerationConfig(n=3, stop_words=['<eoa>'])
_config = EngineGenerationConfig.From(config, tokenizer)
assert _config.n == config.n == 3 and \
_config.max_new_tokens == config.max_new_tokens and \
_config.temperature == config.temperature
assert isinstance(_config.stop_words, List) and \
isinstance(_config.stop_words[0], int)
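
# --- New test file: vision-language chat templates ---
# VLChatTemplateWrapper should turn prompts (optionally with PIL images) into
# messages and render messages into a prompt containing IMAGE_TOKEN.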
import PIL
from lmdeploy.model import MODELS
from lmdeploy.vl.constants import IMAGE_TOKEN
from lmdeploy.vl.templates import VLChatTemplateWrapper
def test_prompt_to_messages():
model = MODELS.get('vicuna')()
    template = VLChatTemplateWrapper(model)
    out = template.prompt_to_messages('hi')
assert isinstance(out, list) and isinstance(out[0], dict)
im = PIL.Image.new(mode='RGB', size=(200, 200))
    out = template.prompt_to_messages(('hi', [im]))
assert isinstance(out, list) and isinstance(out[0], dict)
def test_messages2prompt():
model = MODELS.get('vicuna')()
    template = VLChatTemplateWrapper(model)
messages = [{
'role':
'user',
'content': [{
'type': 'text',
'text': 'hi'
}, {
'type': 'image_url',
'image_url': {
'url': 'xxx'
}
}]
}]
    prompt = template.messages2prompt(messages)
assert isinstance(prompt, str)
assert prompt.count(IMAGE_TOKEN) == 1